This project will explore the following questions:
To answer question 1 we will explore property data for all five boroughs
To answer question 2 we will build, analyze and compare linear models for each borough
The data that we are using from this project come from five separate excel files (one for each borough) and can be obtained from here
The data is for June 2019-May 2020
library(readxl)
library(readr)
library(stringr)
library(dplyr)
library(tidyr)
library(magrittr)
library(ggplot2)
# loading in the data from each excel file setting skip = 4 to avoid header rows
queens <- read_excel("rollingsales_queens.xls", skip = 4)
staten_island <- read_excel("rollingsales_statenisland.xls", skip = 4)
brooklyn <- read_excel("rollingsales_brooklyn.xls", skip = 4)
bronx <- read_excel("rollingsales_bronx.xls", skip = 4)
manhattan <- read_excel("rollingsales_manhattan.xls", skip = 4)
Each Borough in New York has a specific number which we can get from the BOROUGH variable in each dataframe
head(queens$BOROUGH, 1)
[1] "4"
head(staten_island$BOROUGH, 1)
[1] "5"
head(brooklyn$BOROUGH, 1)
[1] "3"
head(bronx$BOROUGH, 1)
[1] "2"
head(manhattan$BOROUGH, 1)
[1] "1"
# The Borough numbers are as follows:
# Manhattan = 1
# Bronx = 2
# Brooklyn = 3
# Queens = 4
# Staten Island = 5
We will now bind the five dataframes into a single dataframe
NYC_property_sales <- bind_rows(manhattan, bronx, brooklyn, queens, staten_island)
Since we have a single dataframe with all NYC property sales, we don’t need the 5 individual dataframes and can remove then using the rm function
rm(brooklyn, bronx, manhattan, queens, staten_island)
Since we have the numbers for each borough we can replace them with the borough names so that it is more clear
NYC_property_sales <- NYC_property_sales %>% mutate(BOROUGH =
case_when(BOROUGH == 1 ~ "Manhattan",
BOROUGH == 2 ~ "Bronx",
BOROUGH == 3 ~ "Brooklyn",
BOROUGH == 4 ~ "Queens",
BOROUGH == 5 ~ "Staten Island"))
Now that we have our data in a single dataframe with the BOROUGH column taken care of there are a few more steps to get this into the format that we need
# Convert all column names to lowercase and replace spaces with underscores
colnames(NYC_property_sales) %<>% str_replace_all("\\s", "_") %>% tolower()
# Convert CAPTIALIZED case to Title case (the NEIGHBORHOOD,BUILDING CLASS CATEGORY and ADDRESS columns)
NYC_property_sales <- NYC_property_sales %>% mutate(neighborhood = str_to_title(neighborhood)) %>% mutate(building_class_category = str_to_title(building_class_category)) %>% mutate(address = str_to_title(address))
# Drop the ease-ment column as it has no data and only select distinct values
NYC_property_sales <- NYC_property_sales %>% select(-`ease-ment`) %>% distinct()
# Filter out property exchanges between family members with a threshold of 10,000
NYC_property_sales <- NYC_property_sales %>% filter(sale_price > 10000) %>%
# Remove properties that have gross_square_feet of 0
filter(gross_square_feet > 0) %>%
# Remove NA values in both sale_price and gross_square_feet
drop_na(c(gross_square_feet, sale_price))
# Arrange by borough and neighborhood
NYC_property_sales <- NYC_property_sales %>% arrange(borough, neighborhood)
We will save it to a csv file
write_csv(NYC_property_sales, "NYC_property_sales.csv")
As I found out for condominums gross_square_feet isn’t being collected for condominiums (this uses the latest data obtained on July 20th 2020) so we need to find out what building types we can use
sort(table(NYC_property_sales$building_class_at_present))
GW H7 H8 H9 I3 J2 M4 P7 Q2 Q8 W1 W6 Z3 Z5 C6 G3 G4 H4 HB I1 I7 I9 J9 K3 K8 P5 P6 Q1 Q9
1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
D8 G5 G8 H2 N2 S0 W3 W7 W8 D5 F2 P2 O3 P9 C9 GU I6 M3 W9 K6 N9 A7 W2 H1 H3 K5 O9 E7 RR
3 3 3 3 3 3 3 3 3 4 4 4 5 5 6 6 6 6 6 7 7 8 8 9 9 10 10 11 11
E2 M9 D9 F1 I5 K7 O1 D3 F4 G9 D6 F9 D7 O4 O8 O6 O5 O2 A6 O7 E9 F5 M1 S4 S5 G2 G1 S3 Z9
12 13 14 15 16 16 16 17 20 20 21 21 23 28 30 31 32 34 35 35 43 44 45 48 50 52 53 56 56
S9 C5 D1 C4 K2 E1 A4 C7 K4 S1 K1 A3 C1 C2 A0 S2 C3 A9 B9 A2 C0 B3 B1 B2 A5 A1
61 66 70 71 85 100 115 122 138 139 173 207 222 230 272 323 363 707 766 1501 1714 1986 2321 2521 3406 4070
The most listed is type A1 which according to the building codes is this: TWO STORIES - DETACHED SM OR MID
According to further research found here it is a single familiy home and thus we will need to include land_square_feet
NYC_property_sales <- NYC_property_sales %>% filter(land_square_feet > 0) %>% drop_na(land_square_feet)
We will now filter according to A1
NYC_sm_md_two_story <- NYC_property_sales %>% filter(building_class_at_time_of_sale == "A1")
head(NYC_sm_md_two_story)
ggplot(data = NYC_sm_md_two_story,
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
aes(x = gross_square_feet, y = sale_price)) +
geom_point(aes(color = borough), alpha = 0.3) +
scale_y_continuous(labels = scales::comma, limits = c(0,10000000)) +
xlim(0, 10000) +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(title = "Single Family Home Sale Price increases with Size",
x = "Size (Gross Square Feet)",
y = "Sale Price (USD)")
Let’s adjust our y limits by looking at the maximum sale price
max(NYC_sm_md_two_story$sale_price)
[1] 9e+06
We see in general as gross square feet increases sale price increases
Let’s look at land square feet to see if that holds true
ggplot(data = NYC_sm_md_two_story,
aes(x = land_square_feet, y = sale_price)) +
geom_point(aes(color = borough), alpha = 0.3) +
scale_y_continuous(labels = scales::comma, limits = c(0,10000000)) +
xlim(0, 10000) +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(title = "Single Family Home Sale Price increases with Size",
x = "Land Size (Square Feet)",
y = "Sale Price (USD)")
We see this also holds up for land square feet (which is included in gross square feet as homes include the square footage of the house as well as land)
We will also zoom in on a smaller subset of data with a max sale price of $2,500,000 and a max gross square feet of 5000
ggplot(data = NYC_sm_md_two_story,
aes(x = gross_square_feet, y = sale_price)) +
geom_point(aes(color = borough), alpha = 0.3) +
scale_y_continuous(labels = scales::comma, limits = c(0,2500000)) +
xlim(0, 5000) +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(title = "Single Family Home Sale Price increases with Size",
x = "Size (Gross Square Feet)",
y = "Sale Price (USD)")
Zooming in also shows for single family homes as the size increases so does the price
Looking at land values
ggplot(data = NYC_sm_md_two_story,
aes(x = land_square_feet, y = sale_price)) +
geom_point(aes(color = borough), alpha = 0.3) +
scale_y_continuous(labels = scales::comma, limits = c(0,2500000)) +
xlim(0, 5000) +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(title = "Single Family Home Sale Price increases with Size",
x = "Land Size (Square Feet)",
y = "Sale Price (USD)")
This trend also shows that as land size increases the sale price also increaes
However, these graphs are cluttered in that all five boroughs are shown.
ggplot(data = NYC_sm_md_two_story,
aes(x = gross_square_feet, y = sale_price)) +
geom_point(alpha = 0.3) +
facet_wrap(~ borough, scales = "free", ncol = 2) +
scale_y_continuous(labels = scales::comma) +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(title = "Single Family Home Sale Price increases with Size",
x = "Size (Gross Square Feet)",
y = "Sale Price (USD)")
For each borough we see that homes with larger gross square feet have higher sale prices
To see any outliers we first need to sort sale prices by highest to lowest
NYC_sm_md_two_story %>% arrange(desc(sale_price)) %>% head()
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
5 of the top sellers are in Brooklyn with one in Queens. The highest price of $9,000,000 was mostly land (looking on realtor.com doesn’t show a house but zillow.com does show a house there)
NYC_sm_md_two_story %>% filter(borough == "Brooklyn") %>% arrange(desc(gross_square_feet)) %>% head()
The largest gross square foot property was converted from a 3 familiy apartment to a single family home
NYC_sm_md_two_story %>% arrange(desc(gross_square_feet)) %>% head()
However looking at gross square feet here shows a different story but then it depends on the borough the property is in.
We will not remove the outlier for now as we want to confirm the sale is legitimate
We will now generate a linear model using sale_price explained by gross_square_feet and land_square_feet
NYC_sm_md_two_story_lm <- lm(sale_price ~ gross_square_feet + land_square_feet, data = NYC_sm_md_two_story)
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
summary(NYC_sm_md_two_story_lm)
Call:
lm(formula = sale_price ~ gross_square_feet + land_square_feet,
data = NYC_sm_md_two_story)
Residuals:
Min 1Q Median 3Q Max
-1491507 -157622 -24579 108987 6836803
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.346e+05 1.429e+04 9.420 <2e-16 ***
gross_square_feet 3.272e+02 9.367e+00 34.932 <2e-16 ***
land_square_feet 6.489e+00 2.563e+00 2.532 0.0114 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 321300 on 4076 degrees of freedom
Multiple R-squared: 0.2975, Adjusted R-squared: 0.2972
F-statistic: 863.2 on 2 and 4076 DF, p-value: < 2.2e-16
If we remove the highest listing
NYC_sm_md_two_story_modified <- NYC_sm_md_two_story %>% filter(address != "704 Avenue I")
NYC_sm_md_two_story_modified_lm <- lm(sale_price ~ gross_square_feet + land_square_feet, data = NYC_sm_md_two_story_modified)
summary(NYC_sm_md_two_story_modified_lm)
Call:
lm(formula = sale_price ~ gross_square_feet + land_square_feet,
data = NYC_sm_md_two_story_modified)
Residuals:
Min 1Q Median 3Q Max
-1369545 -155512 -26950 106644 3692512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.635e+05 1.352e+04 12.087 < 2e-16 ***
gross_square_feet 3.027e+02 8.888e+00 34.061 < 2e-16 ***
land_square_feet 8.960e+00 2.416e+00 3.708 0.000212 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 302600 on 4075 degrees of freedom
Multiple R-squared: 0.2955, Adjusted R-squared: 0.2951
F-statistic: 854.6 on 2 and 4075 DF, p-value: < 2.2e-16
Removing the highest listing caused a reduction in the estimate for gross_square_feet but increased land_square_feet. For standard error it was reduced for both gross_square_feet and land_square_feet
Replotting using facet wrap without the outlier
ggplot(data = NYC_sm_md_two_story_modified,
aes(x = gross_square_feet, y = sale_price)) +
geom_point(alpha = 0.3) +
facet_wrap(~ borough, scales = "free", ncol = 2) +
scale_y_continuous(labels = scales::comma) +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal() +
labs(title = "Single Family Home Sale Price increases with Size",
x = "Size (Gross Square Feet)",
y = "Sale Price (USD)")
We have a linear model for all of the boroughs but would be more useful is if we had a model for each one. We will use the dataset that contains the outlier as that is still a sale (and was confirmed through realtor.com)
We will use the broom workflow which is the following:
nest() function from the tidyr package - we will nest by borough.map() function from the purrr package.broom functions tidy(), augment(), and/or glance() using each nested model - we’ll work with tidy() first.unnest() function - this allows us to see the results.library(broom)
library(tidyr)
library(purrr)
Attaching package: ă¤¼ă¸±purrră¤¼ă¸²
The following object is masked from ă¤¼ă¸±package:magrittră¤¼ă¸²:
set_names
NYC_sm_md_two_story_nested <- NYC_sm_md_two_story %>% group_by(borough) %>% nest()
print(NYC_sm_md_two_story_nested)
Looking at Manhattan we have
head(NYC_sm_md_two_story_nested$data[[3]])
We will now fit a linear model to each dataframe (borough)
NYC_sm_md_two_story_coefficients <- NYC_sm_md_two_story %>% group_by(borough) %>% nest() %>%
mutate(linear_model = map(.x = data,
.f = ~lm(sale_price ~ gross_square_feet + land_square_feet, data = .)
))
print(NYC_sm_md_two_story_coefficients)
Looking at the summary for Manhattan
summary(NYC_sm_md_two_story_coefficients$linear_model[[3]])
Call:
lm(formula = sale_price ~ gross_square_feet + land_square_feet,
data = .)
Residuals:
ALL 1 residuals are 0: no residual degrees of freedom!
Coefficients: (2 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 649000 NA NA NA
gross_square_feet NA NA NA NA
land_square_feet NA NA NA NA
Residual standard error: NaN on 0 degrees of freedom
That is because there is only a single entry for Manhattan looking at Brooklyn
summary(NYC_sm_md_two_story_coefficients$linear_model[[2]])
Call:
lm(formula = sale_price ~ gross_square_feet + land_square_feet,
data = .)
Residuals:
Min 1Q Median 3Q Max
-1514579 -244635 -31472 187012 5551040
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -108124.95 67684.87 -1.597 0.1108
gross_square_feet 519.06 39.73 13.063 <2e-16 ***
land_square_feet 66.31 26.35 2.516 0.0122 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 516800 on 494 degrees of freedom
Multiple R-squared: 0.4289, Adjusted R-squared: 0.4265
F-statistic: 185.5 on 2 and 494 DF, p-value: < 2.2e-16
Transforming this dataframe into a tidy format
NYC_sm_md_two_story_coefficients <- NYC_sm_md_two_story %>% group_by(borough) %>% nest() %>%
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
mutate(linear_model = map(.x = data,
.f = ~lm(sale_price ~ gross_square_feet + land_square_feet, data = .)
)) %>%
mutate(tidy_coefficients = map(.x = linear_model,
.f = tidy,
conf.int = TRUE))
NaNs produced
NYC_sm_md_two_story_coefficients
First inspect for Brooklyn
print(NYC_sm_md_two_story_coefficients$tidy_coefficients[[2]])
And let’s get a tidy dataframe out using unest()
NYC_sm_md_two_story_tidy <- NYC_sm_md_two_story_coefficients %>%
select(borough, tidy_coefficients) %>%
unnest(cols = tidy_coefficients)
print(NYC_sm_md_two_story_tidy)
Gross square feet has a higher estimate but our models include both as these are homes and not condominiums. However, we will use gross square feet as that has a higher estimate
NYC_sm_md_two_story_slope <- NYC_sm_md_two_story_tidy %>% filter(term == "gross_square_feet") %>% arrange(estimate)
print(NYC_sm_md_two_story_slope)
The highest estimate for gross square feet is in Brooklyn (which also has the highest sale price)
Looking at land square feet
NYC_sm_md_two_story_slope_land <- NYC_sm_md_two_story_tidy %>% filter(term == "land_square_feet") %>% arrange(estimate)
print(NYC_sm_md_two_story_slope_land)
This also confirms that brooklyn has the highest estimate for land square feet
We will apply the same workflow to generate tidy summary statistics for each of the five boroughs
NYC_sm_md_two_story_summary <- NYC_sm_md_two_story %>%
There were 24 warnings (use warnings() to see them)
group_by(borough) %>%
nest() %>% mutate(linear_model = map(.x = data,
.f = ~lm(sale_price ~ gross_square_feet + land_square_feet, data = .))) %>%
mutate(tidy_summary_stats = map(.x = linear_model,
.f = glance))
print(NYC_sm_md_two_story_summary)
NA
Now let’s use the tidy_summary_stats to get our conclusion
NYC_sm_md_two_story_tidy <- NYC_sm_md_two_story_summary %>%
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
select(borough, tidy_summary_stats) %>%
unnest(cols = tidy_summary_stats) %>%
arrange(r.squared)
print(NYC_sm_md_two_story_tidy)
We can see that gross square feet and land square feet are good predictors of sales prices with the highest R squared value in Brooklyn followed by Staten Island and the Bronx. This is the lower in Queens and Manhattan has a value of 0 simply because there was a single A1 property sale there.